###############################################
### Project: Communicating in an eventful   ###
###          campaign                       ###
### Task:    Validation                     ###
### Title:   02_Validation.R                ###
###############################################

#---------------------------------------------------------------------------------
# Description:
#
# This script calculates validation statistics for the performed computer-based
# text analysis. The classification results delivered by the cross-domain BERT
# application are compared against human coding ("gold standard").
#---------------------------------------------------------------------------------

#---------------------------------------------------------------------------------------------------------------------

# Preparation

## load packages
library(tidyverse)
library(xlsx)
library(newsmap)
library(irr)

## load classified text data (press releases)
df <- read.csv("Classified_Pressreleases_20210401_20210926.csv", encoding = "UTF-8")

## adjust party names (repair the mis-encoded umlaut in the party label)
df$party <- gsub("GrÃ¼ne", "Grüne", df$party)

## load human coded press releases
df_coded <- read.xlsx("Validation_Data_coded.xlsx", sheetIndex = 1, colIndex = 2:14)

## load codebook
## keep columns 5 and 6 and name them coder_issue / coder_cmp_code
## NOTE(review): the selection is positional -- confirm against the codebook layout
codebook <- read.xlsx("Codebook.xlsx", sheetIndex = 1)
codes <- codebook[, 5:6]
colnames(codes) <- c("coder_issue", "coder_cmp_code")

## attach the issue label that belongs to each human-assigned CMP code
## a CMP code that occurs more than once in the codebook would duplicate the
## matching coded press releases and distort the statistics computed later
if (anyDuplicated(codes$coder_cmp_code) > 0) {
  warning("Codebook lists at least one CMP code more than once; ",
          "the join duplicates the affected coded press releases.",
          call. = FALSE)
}
df_coded <- left_join(df_coded, codes, by = "coder_cmp_code")

## merge human coding with the classified data
## the text column is dropped from df before merging; no `by` is given, so the
## join uses every column name the two data frames have in common
df$text <- NULL
df_final <- left_join(df_coded, df)

## a left join keeps every coded row exactly once unless a row finds several
## matches in df, so a changed row count signals duplicated press releases
if (nrow(df_final) != nrow(df_coded)) {
  warning("Merging with the classified data changed the number of coded ",
          "press releases from ", nrow(df_coded), " to ", nrow(df_final), ".",
          call. = FALSE)
}

## streamline variable coding
## coerce to character first: on a factor column "NA" is not an existing level,
## so the assignment below would only raise a warning and leave the value missing
df_final$coder_issue <- as.character(df_final$coder_issue)
df_final$issue <- as.character(df_final$issue)

## missing issues become the explicit label "NA" so that they are compared
## like any other category instead of being dropped
df_final$coder_issue[is.na(df_final$coder_issue)] <- "NA"
df_final$issue[is.na(df_final$issue)] <- "NA"

## use blanks instead of underscores in both issue variables
df_final$coder_issue <- gsub("_", " ", df_final$coder_issue)
df_final$issue <- gsub("_", " ", df_final$issue)

#---------------------------------------------------------------------------------------------------------------------

# Validation statistics (accuracy; precision, recall and f1 score; kappa)

## Classification: Type (policy-related vs. not policy-related)

### accuracy: share of rows in which `type` equals `coder_type`
### (comparisons that evaluate to NA count as non-matches but stay in the denominator)
acc_type <- round(
  sum(df_final$type == df_final$coder_type, na.rm = TRUE) / nrow(df_final),
  digits = 2
)
acc_type

### accuracy() statistics with `type` as first and `coder_type` as second argument
df_acc_type <- accuracy(
  df_final$type,
  df_final$coder_type
)
df_acc_type

### kappa for the two ratings, passed as a two-column matrix
kappa_type <- kappa2(
  cbind(df_final$type, df_final$coder_type)
)
kappa_type

## Classification: Issue

### accuracy: share of rows in which `issue` equals `coder_issue`
### (comparisons that evaluate to NA count as non-matches but stay in the denominator)
acc_issue <- round(
  sum(df_final$issue == df_final$coder_issue, na.rm = TRUE) / nrow(df_final),
  digits = 2
)
acc_issue

### accuracy() statistics with `issue` as first and `coder_issue` as second argument
df_acc_issue <- accuracy(
  df_final$issue,
  df_final$coder_issue
)
df_acc_issue

### kappa for the two ratings, passed as a two-column matrix
kappa_issue <- kappa2(
  cbind(df_final$issue, df_final$coder_issue)
)
kappa_issue
